import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_classif
from lightgbm import LGBMRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, cross_val_score, RepeatedKFold
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")
# ---- Load the student-performance dataset (math course) ----
path1 = "/kaggle/input/student-performance-data-set/student/student-mat.csv"
# The raw file is semicolon-delimited, not comma-delimited.
data0 = pd.read_csv(path1, sep=";")
pd.set_option('display.max_columns', None)
display(data0.head(), data0.info(), data0.describe())
data0.shape

# Read the accompanying attribute-description file.
path2 = "/kaggle/input/student-performance-data-set/student/student.txt"
# BUGFIX: use a context manager so the file handle is closed even if
# read() raises (the original opened the file and never closed it).
with open(path2, "r") as file:
    meta = file.read()
print(meta)
# Work on a copy so the raw frame stays untouched.
data1 = data0.copy()

# Binary-encode the yes/no flag columns (positions 15..22) and family size
# (LE3 = "less or equal to 3" -> 0, GT3 = "greater than 3" -> 1).
yes_no_cols = data1.columns[15:23]
data1[yes_no_cols] = data1[yes_no_cols].replace({"yes": 1, "no": 0})
data1["famsize"] = data1["famsize"].replace({"LE3": 0, "GT3": 1})

data1.head().transpose()
msno.matrix(data1)          # visual missing-value check
data1.duplicated().sum()    # duplicate-row count
# plt.savefig("/kaggle/working/na_plot")
# Automated EDA report for the encoded frame.
# NOTE(review): `pandas_profiling` is deprecated; the package now ships as
# `ydata-profiling` — consider migrating the import.
from pandas_profiling import ProfileReport
import IPython
report = ProfileReport(data1)
IPython.display.display(report)
# Manual integer encoding of the remaining nominal columns.
# NOTE(review): `value_counts().index` depends on the observed frequency
# order, so these codes are data-dependent — an explicit mapping dict
# would be more reproducible; verify the intended code per category.
X = data1.copy()
X["Mjob"] = X["Mjob"].replace(X["Mjob"].value_counts().index, [1,2,0,4,3])
X["Fjob"] = X["Fjob"].replace(X["Fjob"].value_counts().index, [1,2,4,0,3])
X["reason"] = X["reason"].replace(X["reason"].value_counts().index, [3,0,1,2])
# Grade delta between the first two period grades; G2 itself is dropped,
# presumably to reduce collinearity with the target G3 — G1 is kept.
X["difG"] = X["G2"] - X["G1"]
X.drop(columns="G2", inplace=True)
sns.pairplot(X.loc[:,["absences", "G1", "difG", "G3"]], kind="reg")
# Target is the final grade G3; hold out 24% of rows as the test set.
y = X.pop("G3")
X_train, X_uji, y_train, y_uji = train_test_split(X, y, test_size = 0.24, random_state = 42)
display(X_train, X_uji)
# Training copies used throughout model selection.
X1 = X_train.copy()
y1 = y_train.copy()

# Split columns by dtype: object -> categorical, everything else -> numeric.
kat_kol = X1.select_dtypes("object").columns
num_kol = X1.select_dtypes(exclude="object").columns

# Two categorical strategies: ordinal codes vs one-hot (binary columns
# kept as a single indicator via drop="if_binary").
kat_trf1 = Pipeline([("kat_en", OrdinalEncoder())])
kat_trf2 = Pipeline([("kat_en", OneHotEncoder(drop="if_binary"))])

# Numeric columns only get median imputation here (tree models don't
# need scaling).
num_trf = Pipeline([("num_im", SimpleImputer(strategy="median"))])

# Full preprocessors: pre_1 = ordinal track, pre_2 = one-hot track.
pre_1 = ColumnTransformer([("kat", kat_trf1, kat_kol), ("num", num_trf, num_kol)])
pre_2 = ColumnTransformer([("kat", kat_trf2, kat_kol), ("num", num_trf, num_kol)])
# Feature importance ranking via mutual information.
def make_mi_scores(X, y, rs=11, ma=pre_1, use=True):
    """Fit preprocessor `ma` on X, score each output feature against y with
    mutual information, plot the scores, and return them sorted descending.

    Parameters
    ----------
    X : raw feature DataFrame.
    y : target series (here the integer final grade G3).
    rs : random_state forwarded to the MI estimator.
    ma : preprocessor (ColumnTransformer) fitted on the fly.
    use : if True, label rows with `ma.get_feature_names_out()` (stripping
        the "transformer__" prefix); if False, fall back to `X.columns`.
        NOTE(review): the False fallback breaks when `ma` changes the
        column count (e.g. one-hot encoding) — lengths then disagree.

    Returns
    -------
    pd.Series of MI scores, sorted descending, indexed by feature name.
    """
    X_enc = ma.fit_transform(X)
    kol = X.columns
    if use:
        # Drop the ColumnTransformer prefix ("kat__Mjob" -> "Mjob").
        # maxsplit=1 and [-1] keep names without a prefix intact instead
        # of raising IndexError (the original used split("__")[1]).
        kol = [name.split("__", 1)[-1] for name in ma.get_feature_names_out()]
    # NOTE(review): the target is a discrete 0-20 grade so the
    # classification MI estimator runs, but for this regression task
    # `mutual_info_regression` would be the conventional choice — confirm.
    mi_scores = pd.Series(
        mutual_info_classif(X_enc, y, discrete_features=True, random_state=rs),
        name="MI Scores",
        index=kol,
    ).sort_values(ascending=False)
    plt.figure(figsize=(8, 8))
    plt.barh(y=mi_scores.index, width=mi_scores)
    plt.xlabel("MI Scores")
    return mi_scores
# Rank features by MI and keep all but the 10 lowest-scoring ones.
display(make_mi_scores(X1, y1)[:-10])
kolom2 = make_mi_scores(X1, y1)[:-10].index
kolom2
# plt.savefig("/kaggle/working/mi")
# Reduced feature set for the second modelling track.
X2 = X1[kolom2]
y2 = y1.copy()
kat_kol2 = X2.select_dtypes("object").columns
num_kol2 = X2.select_dtypes(exclude="object").columns
# Ordinal encoder for the reduced set.
# NOTE(review): this rebinds `kat_trf2`, shadowing the one-hot pipeline
# defined earlier; `pre_2` already captured the old object, so it is safe,
# but a distinct name would be clearer.
kat_trf2 = Pipeline(steps=[
("kat_en", OrdinalEncoder())
])
num_trf2 = Pipeline(steps=[("num_im", SimpleImputer(strategy="median"))])
# Merged preprocessing for the reduced feature set.
pre2 = ColumnTransformer(transformers=[
('kat', kat_trf2, kat_kol2),
('num', num_trf2, num_kol2)
])
# Model-zoo imports (some duplicate the top-of-file imports).
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as imbpipeline
import pandas as pd
# Cross-validation scheme shared by every search below:
# 5 folds repeated twice -> 10 fits per candidate.
rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=0)
# Define a list of regression models
models = [
LinearRegression(),
Ridge(random_state=42),
Lasso(random_state=42),
ElasticNet(random_state=42),
SVR(),
KNeighborsRegressor(),
DecisionTreeRegressor(random_state=42),
RandomForestRegressor(random_state=42),
GradientBoostingRegressor(random_state=42),
XGBRegressor(random_state=42),
LGBMRegressor(random_state=42),
]
res1 = []
for model in models:
pipa = imbpipeline(steps=[
("preprocess", pre_1),
("clf", model)
])
grid_search = GridSearchCV(estimator=pipa, param_grid = {}, cv=rkf, scoring='neg_root_mean_squared_error', verbose=1)
grid_search.fit(X1, y1)
# Save the model's performance metrics in a dictionary
result = {
'Model': type(model).__name__,
'RMSE1': grid_search.best_score_
}
# Append the dictionary to the results list
res1.append(result)
# Repeat the baseline sweep on the reduced feature set (pre2 / X2).
models = [
    LinearRegression(),
    Ridge(random_state=42),
    Lasso(random_state=42),
    ElasticNet(random_state=42),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    XGBRegressor(random_state=42),
    LGBMRegressor(random_state=42),
]

res2 = []
for model in models:
    candidate = imbpipeline(steps=[("preprocess", pre2), ("clf", model)])
    search = GridSearchCV(
        estimator=candidate,
        param_grid={},
        cv=rkf,
        scoring="neg_root_mean_squared_error",
        verbose=1,
    )
    search.fit(X2, y1)
    res2.append({"Model": type(model).__name__, "RMSE2": search.best_score_})

# One row per model with both CV scores side by side.  Scores are negated
# RMSE, so "higher" (closer to zero) is better.
performa = pd.DataFrame(res1).join(pd.DataFrame(res2).set_index("Model"), on="Model")
performa.sort_values("RMSE1", ascending=False)
GradientBoostingRegressor
# Gradient boosting on the full (gbm1/pre_1) and reduced (gbm2/pre2) sets.
gbm1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", GradientBoostingRegressor(random_state=0))
])
gbm2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", GradientBoostingRegressor(random_state=0))
])
# Grid for gbm1 — only min_samples_split is actually varied.
param_grid = {
'clf__n_estimators': [1000],
'clf__learning_rate': [0.01],
'clf__max_depth': [5],
'clf__min_samples_split': [6, 8],
'clf__min_samples_leaf': [3],
'clf__subsample': [0.8]
}
# Four metrics are tracked; the refit/best model is selected by RMSE.
grid_gbm1 = GridSearchCV(estimator=gbm1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Single-point grid for gbm2 (scores one fixed configuration).
param_grid = {
'clf__n_estimators': [800],
'clf__learning_rate': [0.01],
'clf__max_depth': [5],
'clf__min_samples_split': [6],
'clf__min_samples_leaf': [3],
'clf__subsample': [0.8]
}
grid_gbm2 = GridSearchCV(estimator=gbm2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search.
# NOTE(review): gbm2 is fitted on the full X1; pre2 selects the kolom2
# columns by name, so this is equivalent to fitting on X2.
grid_gbm1.fit(X1, y1)
grid_gbm2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_gbm1.best_params_)
print("Best Scorer", grid_gbm1.best_score_)
print("Best Parameter", grid_gbm2.best_params_)
print("Best Scorer", grid_gbm2.best_score_)
Best Parameter {'clf__learning_rate': 0.01, 'clf__max_depth': 5, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 8, 'clf__n_estimators': 1000, 'clf__subsample': 0.8}
Best Scorer -1.7181985438456961
Best Parameter {'clf__learning_rate': 0.01, 'clf__max_depth': 5, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 6, 'clf__n_estimators': 800, 'clf__subsample': 0.8}
Best Scorer -1.7439000375229905
# Show only the aggregated "mean_*" columns of both CV result tables.
display(pd.DataFrame(grid_gbm1.cv_results_).filter(regex='^mean',axis=1),
pd.DataFrame(grid_gbm2.cv_results_).filter(regex='^mean',axis=1))
XGB
# XGBoost on the full (xgb1) and reduced (xgb2) feature sets.
xgb1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", XGBRegressor(random_state=0))
])
xgb2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", XGBRegressor(random_state=0))
])
# Only max_depth is varied; both grids are identical.
param_grid = {
'clf__n_estimators': [1000],
'clf__learning_rate': [0.01],
'clf__max_depth': [5,7],
'clf__subsample': [0.8]
}
# Multi-metric scoring; best model refit on RMSE.
grid_xgb1 = GridSearchCV(estimator=xgb1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
param_grid = {
'clf__n_estimators': [1000],
'clf__learning_rate': [0.01],
'clf__max_depth': [5,7],
'clf__subsample': [0.8]
}
grid_xgb2 = GridSearchCV(estimator=xgb2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search (pre2 selects its columns from X1 by name).
grid_xgb1.fit(X1, y1)
grid_xgb2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_xgb1.best_params_)
print("Best Scorer", grid_xgb1.best_score_)
print("Best Parameter", grid_xgb2.best_params_)
print("Best Scorer", grid_xgb2.best_score_)
Best Parameter {'clf__learning_rate': 0.01, 'clf__max_depth': 7, 'clf__n_estimators': 1000, 'clf__subsample': 0.8}
Best Scorer -1.6934167831414946
Best Parameter {'clf__learning_rate': 0.01, 'clf__max_depth': 7, 'clf__n_estimators': 1000, 'clf__subsample': 0.8}
Best Scorer -1.6527267233074798
# Show only the aggregated "mean_*" columns of both CV result tables.
display(pd.DataFrame(grid_xgb1.cv_results_).filter(regex='^mean',axis=1),
pd.DataFrame(grid_xgb2.cv_results_).filter(regex='^mean',axis=1))
Random Forest
# Random forest on the full (rf1) and reduced (rf2) feature sets.
rf1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", RandomForestRegressor(random_state=0))
])
rf2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", RandomForestRegressor(random_state=0,))
])
# 2x2 grid over forest size and split threshold.
param_grid = {
'clf__n_estimators': [50, 100],
'clf__min_samples_split': [4,5]
}
grid_rf1 = GridSearchCV(estimator=rf1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
param_grid = {
'clf__n_estimators': [50, 100],
'clf__min_samples_split': [4,5]
}
grid_rf2 = GridSearchCV(estimator=rf2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search (pre2 selects its columns from X1 by name).
grid_rf1.fit(X1, y1)
grid_rf2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_rf1.best_params_)
print("Best Scorer", grid_rf1.best_score_)
print("Best Parameter", grid_rf2.best_params_)
print("Best Scorer", grid_rf2.best_score_)
Best Parameter {'clf__min_samples_split': 5, 'clf__n_estimators': 50}
Best Scorer -1.9122164039296181
Best Parameter {'clf__min_samples_split': 4, 'clf__n_estimators': 50}
Best Scorer -1.9164153265604813
# Show only the aggregated "mean_*" columns of both CV result tables.
display(pd.DataFrame(grid_rf1.cv_results_).filter(regex='^mean',axis=1),
pd.DataFrame(grid_rf2.cv_results_).filter(regex='^mean',axis=1))
Elastic Net
# Elastic net on the full (en1) and reduced (en2) feature sets.
en1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", ElasticNet(random_state=0,))
])
en2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", ElasticNet(random_state=0))
])
# en1 leans L1 (l1_ratio=0.8); intercept disabled in both grids.
param_grid = {
'clf__alpha': [0.2, 0.3],
'clf__l1_ratio': [0.8],
'clf__fit_intercept': [False]
}
grid_en1 = GridSearchCV(estimator=en1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# en2 leans L2 (l1_ratio=0.2).
param_grid = {
'clf__alpha': [0.2, 0.3],
'clf__l1_ratio': [0.2],
'clf__fit_intercept': [False]
}
grid_en2 = GridSearchCV(estimator=en2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search (pre2 selects its columns from X1 by name).
grid_en1.fit(X1, y1)
grid_en2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_en1.best_params_)
print("Best Scorer", grid_en1.best_score_)
print("Best Parameter", grid_en2.best_params_)
print("Best Scorer", grid_en2.best_score_)
Best Parameter {'clf__alpha': 0.2, 'clf__fit_intercept': False, 'clf__l1_ratio': 0.8}
Best Scorer -1.792554560369446
Best Parameter {'clf__alpha': 0.3, 'clf__fit_intercept': False, 'clf__l1_ratio': 0.2}
Best Scorer -1.8028888029814474
# Show only the aggregated "mean_*" columns of both CV result tables.
display(pd.DataFrame(grid_en1.cv_results_).filter(regex='^mean',axis=1),
pd.DataFrame(grid_en2.cv_results_).filter(regex='^mean',axis=1))
Ridge
# Ridge regression on the full (rr1) and reduced (rr2) feature sets.
rr1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", Ridge(random_state=0))
])
rr2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", Ridge(random_state=0))
])
# 3x3x2 grid: regularisation strength, solver, intercept on/off.
param_grid = {
'clf__alpha': [10, 100, 1000],
'clf__solver': ['lsqr', 'sag', 'saga'],
'clf__fit_intercept': [True, False]
}
grid_rr1 = GridSearchCV(estimator=rr1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
param_grid = {
'clf__alpha': [10, 100, 1000],
'clf__solver': ['lsqr', 'sag', 'saga'],
'clf__fit_intercept': [True, False]
}
grid_rr2 = GridSearchCV(estimator=rr2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search (pre2 selects its columns from X1 by name).
grid_rr1.fit(X1, y1)
grid_rr2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_rr1.best_params_)
print("Best Scorer", grid_rr1.best_score_)
print("Best Parameter", grid_rr2.best_params_)
print("Best Scorer", grid_rr2.best_score_)
Best Parameter {'clf__alpha': 100, 'clf__fit_intercept': False, 'clf__solver': 'saga'}
Best Scorer -1.80785337865716
Best Parameter {'clf__alpha': 100, 'clf__fit_intercept': False, 'clf__solver': 'saga'}
Best Scorer -1.8049450473228965
LGBM
# LightGBM on the full (lgb1) and reduced (lgb2) feature sets.
lgb1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", LGBMRegressor(random_state=0))
])
lgb2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", LGBMRegressor(random_state=0))
])
# lgb1 grid varies tree count and depth; light L1/L2 regularisation fixed.
param_grid = {
'clf__learning_rate': [0.1],
'clf__n_estimators': [100,50],
'clf__max_depth': [4,5],
'clf__subsample': [0.6],
'clf__reg_alpha': [0.1],
'clf__reg_lambda': [0.3]
}
grid_lgb1 = GridSearchCV(estimator=lgb1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# lgb2 grid: fixed depth, stronger L1, no L2.
param_grid = {
'clf__learning_rate': [0.1],
'clf__n_estimators': [100, 50],
'clf__max_depth': [4],
'clf__subsample': [0.6],
'clf__reg_alpha': [0.5],
'clf__reg_lambda': [0]
}
grid_lgb2 = GridSearchCV(estimator=lgb2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search (pre2 selects its columns from X1 by name).
grid_lgb1.fit(X1, y1)
grid_lgb2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_lgb1.best_params_)
print("Best Scorer", grid_lgb1.best_score_)
print("Best Parameter", grid_lgb2.best_params_)
print("Best Scorer", grid_lgb2.best_score_)
Best Parameter {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 50, 'clf__reg_alpha': 0.1, 'clf__reg_lambda': 0.3, 'clf__subsample': 0.8}
Best Scorer -1.9291580763393348
Best Parameter {'clf__learning_rate': 0.1, 'clf__max_depth': 4, 'clf__n_estimators': 50, 'clf__reg_alpha': 0.5, 'clf__reg_lambda': 0, 'clf__subsample': 0.8}
Best Scorer -1.920183156636173
Lasso
# Lasso on the full (lr1) and reduced (lr2) feature sets.
lr1 = imbpipeline(steps=[
("preprocess", pre_1),
("clf", Lasso(random_state=0))
])
lr2 = imbpipeline(steps=[
("preprocess", pre2),
("clf", Lasso(random_state=0))
])
# 5x2 grid: alpha across four orders of magnitude, intercept on/off.
param_grid = {
'clf__alpha': [0.001, 0.01, 0.1, 1, 10],
'clf__fit_intercept': [True, False]
}
grid_lr1 = GridSearchCV(estimator=lr1, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
param_grid = {
'clf__alpha': [0.001, 0.01, 0.1, 1, 10],
'clf__fit_intercept': [True, False]
}
grid_lr2 = GridSearchCV(estimator=lr2, param_grid=param_grid, cv=rkf, scoring={"mse": 'neg_mean_squared_error',
    "rsq": 'r2',
    "rmse": 'neg_root_mean_squared_error',
    "mae": 'neg_mean_absolute_error'},
    verbose = 1, refit="rmse")
# Run the hyperparameter search (pre2 selects its columns from X1 by name).
grid_lr1.fit(X1, y1)
grid_lr2.fit(X1, y1)
# Report the best parameters and best (negated RMSE) scores.
print("Best Parameter", grid_lr1.best_params_)
print("Best Scorer", grid_lr1.best_score_)
print("Best Parameter", grid_lr2.best_params_)
print("Best Scorer", grid_lr2.best_score_)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Parameter {'clf__alpha': 0.1, 'clf__fit_intercept': False}
Best Scorer -1.8037960394692867
Best Parameter {'clf__alpha': 0.1, 'clf__fit_intercept': False}
Best Scorer -1.803185235128425
Best Model: XGB
# Final evaluation of the chosen XGB pipelines on the held-out test set.
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import explained_variance_score, r2_score
pred1 = grid_xgb1.predict(X_uji)
pred2 = grid_xgb2.predict(X_uji)
metrik = ["RMSE", "MSE", "MAE", "EV", "R2"]
# NOTE(review): `squared=False` is deprecated in recent scikit-learn;
# `root_mean_squared_error` is the replacement.
model1 = [mean_squared_error(y_uji, pred1, squared=False)]
model2 = [mean_squared_error(y_uji, pred2, squared=False)]
# Append MSE, MAE, explained variance and R2, matching `metrik` order.
for j in [mean_squared_error, mean_absolute_error, explained_variance_score, r2_score]:
    model1.append(j(y_uji, pred1))
    model2.append(j(y_uji, pred2))
    pass
# evaluasi = pd.DataFrame({"model1":model1, "model2":model2}, index=metrik).transpose()
# Only model2 (reduced feature set) is reported as the final model.
evaluasi = pd.DataFrame({"Best Model":model2}, index=metrik).transpose()
# Express EV and R2 (columns 3 onward) as percentages.
evaluasi.iloc[:,3:]= evaluasi.iloc[:,3:]*100
round(evaluasi,2)
# Decision note: both models score similarly; the simpler one is kept.
# Feature importances of the winning model.
# (Commented-out block below would plot importances for the full-feature
# model grid_xgb1 — kept for reference.)
# fitur1 = [j.split("__")[1] for j in grid_xgb1.best_estimator_.named_steps["preprocess"].get_feature_names_out()]
# imp1 = pd.DataFrame({'feature': fitur1,
# 'importance': grid_xgb1.best_estimator_.named_steps["clf"].feature_importances_})
# imp1 = imp1.sort_values('importance', ascending=False).reset_index(drop=True)
# plt.figure(figsize=(10, 8))
# sns.barplot(x='importance', y='feature', data=imp1)
# Strip the ColumnTransformer prefix ("kat__Mjob" -> "Mjob") for labels.
fitur2 = [j.split("__")[1] for j in grid_xgb2.best_estimator_.named_steps["preprocess"].get_feature_names_out()]
imp2 = pd.DataFrame({'feature': fitur2,
'importance': grid_xgb2.best_estimator_.named_steps["clf"].feature_importances_})
imp2 = imp2.sort_values('importance', ascending=False).reset_index(drop=True)
plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=imp2)
# Boxen plots of the final grade G3 against selected important features.
plot_vars = ['reason', 'schoolsup', 'Dalc', 'failures', 'Fedu', 'internet', 'age']
fig, axes = plt.subplots(3, 3, figsize=(20, 16))
# BUGFIX: the original iterated `for i, var in enumerate(var)`, rebinding
# the list's own name inside the loop; a distinct list name removes the
# shadowing (behavior was identical, readability was not).
for i, var in enumerate(plot_vars):
    row, col = divmod(i, 3)
    if var != 'age':
        sns.boxenplot(data=data0, x=var, y='G3', ax=axes[row, col], palette='Paired')
        axes[row, col].set_title(f'G3 by {var}', fontsize=12, fontweight='bold')
        axes[row, col].set_xlabel(var)
        axes[row, col].set_ylabel('G3')
        axes[row, col].tick_params(axis='x', rotation=45)
    else:
        # 'age' (the 7th variable) is centred on the bottom row instead of
        # taking its natural slot (2, 0).
        sns.boxenplot(data=data0, x=var, y='G3', ax=axes[2, 1], palette='Paired')
        axes[2, 1].set_title(f'G3 by {var}', fontsize=12, fontweight='bold')
        axes[2, 1].set_xlabel(var)
        axes[2, 1].set_ylabel('G3')
        axes[2, 1].tick_params(axis='x', rotation=45)
# Hide the two unused bottom-row axes.
axes[2, 0].axis('off')
axes[2, 2].axis('off')
plt.tight_layout()
plt.savefig('viz1.png', dpi=300)
plt.show()